# Tweet Clusters

### Environment Prep

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

import re
import glob
import json

import time

import nltk

from sklearn.externals import joblib

Read all json files, append to a list, and concat into a dataframe called "tweets"

In [2]:
read_files = glob.glob("data/*.json")
print(read_files)
print() ; 

files = []
for file in read_files:
    files.append(pd.read_json(file))
    print('Appending ' + file + '...')

print() ; 
tweets = pd.concat(files)
print(tweets.shape)

['data/mar01_apr01.json', 'data/nov01_dec01.json', 'data/jan01_feb01.json', 'data/feb01_mar01.json', 'data/apr01_apr14.json', 'data/dec01_jan01.json', 'data/oct20_nov01.json', 'data/oct14_oct20.json']

Appending data/mar01_apr01.json...
Appending data/nov01_dec01.json...
Appending data/jan01_feb01.json...
Appending data/feb01_mar01.json...
Appending data/apr01_apr14.json...
Appending data/dec01_jan01.json...
Appending data/oct20_nov01.json...
Appending data/oct14_oct20.json...

(1392076, 10)


In [3]:
# print(tweets.head(3))

In [4]:
data = tweets

### Clean the data

In [5]:
data['original_text'] = data['text']

In [6]:
# 'http' matches literal characters
# '\S+' matches all non-whitespace characters (the end of the url)

# remove http links
data['text'] = data['text'].str.replace('http\S+', '', case=False)

# remove www links
data['text'] = data['text'].str.replace('www.\S+', '', case=False)

# remove twitter pics
data['text'] = data['text'].str.replace('pic.twitter.com\S+', '', case=False)

In [7]:
# create a function to search for hashtags
def hashtagSearch(text):
    array = []
    for word in text.split(' '):
        if len(word) > 1:
            if word[0] =='#':
                array.append(word)
    return(array)

# make new column of hashtags
data['hashtags'] = data['text'].apply(hashtagSearch)
# print(data.head)

In [8]:
# create a function to search for usernames
def usernameSearch(text):
    array = []
    for word in text.split(' '):
        if len(word) > 1:
            if word[0] =='@':
                array.append(word)
    return(array)

# make new column of hashtags
data['usernames'] = data['text'].apply(usernameSearch)
# print(data.head)

In [9]:
# remove hashtags
data['text'] = data['text'].str.replace('#\S+', '', case=False)

# remove users
data['text'] = data['text'].str.replace('@\S+', '', case=False)

# remove …
data['text'] = data['text'].str.replace('…','')

In [10]:
tweets['engagement'] = tweets['likes'] + tweets['replies'] + 2*tweets['retweets']

In [15]:
print(data.head())
joblib.dump(data, 'data.pkl')

        fullname                                               html  \
0    Judy Gayton  <p class="TweetTextSize js-tweet-text tweet-te...   
1          julie  <p class="TweetTextSize js-tweet-text tweet-te...   
2  Rachel Powell  <p class="TweetTextSize js-tweet-text tweet-te...   
3  Kevin Johnson  <p class="TweetTextSize js-tweet-text tweet-te...   
4   JohnnyZipper  <p class="TweetTextSize js-tweet-text tweet-te...   

                   id  likes  replies  retweets  \
0  980233135471214592      6        1        10   
1  980233120401240064      0        0         0   
2  980233027103072256      0        0         0   
3  980232921100570624      0        0         0   
4  980232826523193344      1        0         0   

                                                text           timestamp  \
0  Despite 45% being sexually assaulted/raped (22... 2018-03-31 23:59:26   
1                     How Easter became a  moment\n  2018-03-31 23:59:22   
2          Praying and grateful for yo

['data.pkl']

### Tokenize and Stem

In [16]:
# nltk.download()

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [17]:
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [18]:
totalvocab_stemmed = []
totalvocab_tokenized = []

for i in data.text:
    allwords_stemmed = tokenize_and_stem(i) # for each item in 'data.text', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) # extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

In [19]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')
print()
print(totalvocab_stemmed[0:40])
print()
# print(vocab_frame.head(30))

there are 26193288 items in vocab_frame

['despit', 'be', 'sexual', 'assaulted/rap', 'by', 'staff', 'onli', 'report', 'to', 'polic', 'render', 'psych', 'ward', 'one', 'of', 'the', 'most', 'danger', 'place', 'imagin', 'mental', 'health', 'minist', 'foley', 'resist', 'women-on', 'ward', 'is', 'unaccept', 'how', 'easter', 'becam', 'a', 'moment', 'pray', 'and', 'grate', 'for', 'your', 'strength']



### Tf-idf Vectorizer

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(
    analyzer='word', 
    tokenizer=tokenize_and_stem,
    stop_words='english', 
    min_df=0.0050, # word must be present in 0.5% of documents, or 50 documents of 10,000
    max_df=0.9900, # word cannot be present in 99% of documents, or 9,900 documents of 10,000
    ngram_range=(1,3)
)

X_tfidf_v = tfidf_vectorizer.fit_transform(data.text)

print(X_tfidf_v.shape)

(1392076, 348)


In [21]:
# print(tfidf_vectorizer.stop_words_)

In [22]:
print(tfidf_vectorizer.vocabulary_)

{'sexual': 258, 'onli': 212, 'report': 244, 'place': 224, 'moment': 196, 'say': 254, 'anoth': 21, 'abus': 2, 'articl': 25, 'claim': 61, 'stori': 287, 'like': 173, 'true': 306, "n't": 201, 'help': 137, 'feel': 105, 'ani': 20, 'white': 331, 'way': 326, 'address': 9, 'girl': 119, 'victim': 317, 'state': 283, 'movement': 199, 's': 251, 'think': 297, 'matter': 188, 'rape': 237, 'import': 146, 'everyon': 97, 'resign': 245, "'s": 1, 'assault': 27, 'sexual assault': 260, 'touch': 304, 'end': 93, 'watch': 325, 'campaign': 51, 'media': 191, 'need': 202, 'democrat': 80, 'mean': 190, 'told': 302, 'did': 82, "did n't": 83, 'share': 265, 'share stori': 266, 'time': 299, 'said': 253, 'alleg': 13, 'ca': 49, 'believ': 37, "ca n't": 50, 'real': 240, 'make': 183, 'veri': 316, 'feminist': 107, 'polit': 228, 'power': 230, 'world': 341, 'old': 211, 'whi': 330, 'twitter': 311, 'protect': 234, 'tweet': 310, 'account': 3, 'check': 58, 'becaus': 33, 'harass': 130, 'school': 255, 'shame': 264, 'person': 221, 'ac

In [23]:
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

### Run kmeans

In [24]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
X = sc.fit_transform(X_tfidf_v)
print(X.shape)

(1392076, 348)


In [25]:
joblib.dump(X, 'X.pkl')

['X.pkl']

In [26]:
from sklearn.cluster import KMeans
# from sklearn.metrics import silhouette_samples, silhouette_score

In [27]:
num_clusters = 425
kmeans = KMeans(n_clusters=num_clusters)
model = kmeans.fit(X)

print(kmeans.cluster_centers_)
print(kmeans.labels_.shape)

clusters = kmeans.labels_.tolist()
print(clusters[:10])

[[0.17531691 0.2590698  0.19265249 ... 0.11223534 0.         0.01696422]
 [0.02490946 0.32915857 0.12886114 ... 0.01490448 0.         0.        ]
 [0.         0.26466425 0.17740592 ... 0.00828306 0.         0.        ]
 ...
 [0.0673878  0.47804178 0.228185   ... 0.04230531 0.         0.02199367]
 [0.15965446 0.25703716 0.19190393 ... 0.08934321 0.         0.02339985]
 [0.09279151 0.2521577  0.12918645 ... 0.08269735 0.         0.00608201]]
(1392076,)
[13, 83, 19, 181, 39, 343, 329, 83, 361, 142]


In [28]:
data['cluster'] = clusters
print(data.head(3))

        fullname                                               html  \
0    Judy Gayton  <p class="TweetTextSize js-tweet-text tweet-te...   
1          julie  <p class="TweetTextSize js-tweet-text tweet-te...   
2  Rachel Powell  <p class="TweetTextSize js-tweet-text tweet-te...   

                   id  likes  replies  retweets  \
0  980233135471214592      6        1        10   
1  980233120401240064      0        0         0   
2  980233027103072256      0        0         0   

                                                text           timestamp  \
0  Despite 45% being sexually assaulted/raped (22... 2018-03-31 23:59:26   
1                     How Easter became a  moment\n  2018-03-31 23:59:22   
2          Praying and grateful for your strength!   2018-03-31 23:59:00   

                                         url            user  \
0       /JMGayton1/status/980233135471214592       JMGayton1   
1  /nohumanillegal/status/980233120401240065  nohumanillegal   
2  /MsRachelP

In [29]:
joblib.dump(model, 'kmeans.pkl')

['kmeans.pkl']

In [30]:
joblib.dump(data, 'data_clusters.pkl')

['data_clusters.pkl']

### Explore Results

In [191]:
{i: X[np.where(kmeans.labels_ == i)] for i in range(kmeans.n_clusters)}

{0: <3373x348 sparse matrix of type '<class 'numpy.float64'>'
 	with 17213 stored elements in Compressed Sparse Row format>,
 1: <3093x348 sparse matrix of type '<class 'numpy.float64'>'
 	with 9232 stored elements in Compressed Sparse Row format>,
 2: <765x348 sparse matrix of type '<class 'numpy.float64'>'
 	with 1122 stored elements in Compressed Sparse Row format>,
 3: <3947x348 sparse matrix of type '<class 'numpy.float64'>'
 	with 12162 stored elements in Compressed Sparse Row format>,
 4: <3777x348 sparse matrix of type '<class 'numpy.float64'>'
 	with 15808 stored elements in Compressed Sparse Row format>,
 5: <3109x348 sparse matrix of type '<class 'numpy.float64'>'
 	with 13165 stored elements in Compressed Sparse Row format>,
 6: <3994x348 sparse matrix of type '<class 'numpy.float64'>'
 	with 18555 stored elements in Compressed Sparse Row format>,
 7: <5311x348 sparse matrix of type '<class 'numpy.float64'>'
 	with 67773 stored elements in Compressed Sparse Row format>,
 8:

In [202]:
print(X[np.where(kmeans.labels_ == 0)].shape)
print(X.shape)

(3373, 348)
(1392076, 348)


In [195]:
cluster_count = data.groupby(['cluster'])['cluster'].count()
cluster_count = pd.DataFrame(dict(cluster=cluster_count.index, count=cluster_count.values))

NameError: name 'head' is not defined

In [196]:
cluster_count.head()

Unnamed: 0,cluster,count
0,0,3373
1,1,3093
2,2,765
3,3,3947
4,4,3777


In [32]:
cluster_count.to_csv('cluster_count.csv', index=False)

In [33]:
terms = tfidf_vectorizer.get_feature_names()

In [34]:
from __future__ import print_function

print("Top terms per cluster:")
print()

#sort cluster centers by proximity to centroid
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-5] 


for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :10]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0], end=',')
        
    print() #add whitespace
    print() #add whitespace

Top terms per cluster:

Cluster 0 words: finally, hopes, people, story, sexually, started, got, making, believe, truth,

Cluster 1 words: talks, why, women, thing, sexually, did, making, know, person, help,

Cluster 2 words: sexually, woman, thing, forced, shares, men, joke, equally, t, yes,

Cluster 3 words: posts, read, movement, 've, n't, years, shares, man, because, supporting,

Cluster 4 words: industry, changing, n't, female, work, abuse, media, children, issue, sex,

Cluster 5 words: believe, supporting, why, say, trump, movement, like, think, person, assaulted,

Cluster 6 words: star, movement, talks, say, lady, awards, hits, feminist, turns, reported,

Cluster 7 words: come, victims, story, sexually, feel, 's, shares, just, woman, shameful,

Cluster 8 words: don, s, women, needs, just, people, rights, thing, lie, really,

Cluster 9 words: long, thanks, read, times, does, free, girls, feel, every, exposes,

Cluster 10 words: new, new, maybe, world, lie, good, way, trump, fear, 

Cluster 176 words: man, women, s, men, power, sex, times, way, victims, read,

Cluster 177 words: country, every, moment, culture, like, trump, new, just, men, know,

Cluster 178 words: oscars, era, t, accusations, happening, winning, got, posts, supporting, best,

Cluster 179 words: new, golden, industry, harvey, points, address, news, women, state, era,

Cluster 180 words: deserves, nothing, work, cover, stand, 's, believe, years, did, abuse,

Cluster 181 words: another, way, movement, look, come, sexually, sexually, rape, campaign, tell,

Cluster 182 words: shit, know, twitter, needs, latest, hits, guy, use, talks, lot,

Cluster 183 words: just, women, times, abuse, children, shares, many, really, watch, man,

Cluster 184 words: news, 's, harassing, s, new, just, n't, forced, come, tried,

Cluster 185 words: way, work, changing, like, woman, people, 'm, power, moment, years,

Cluster 186 words: sign, women, actresses, sexually, march, sexually, look, know, asking, violence,

Cluster

Cluster 352 words: step, public, hey, fucked, issue, n't, use, sexually, officers, bring,

Cluster 353 words: free, talks, join, 's, movement, speak, help, use, woman, assaulted,

Cluster 354 words: wondering, good, like, think, movement, before, started, because, m, today,

Cluster 355 words: speech, president, message, night, new, great, during, black, look, said,

Cluster 356 words: let, together, stop, just, say, bring, conversation, clear, world, 'm,

Cluster 357 words: sure, n't, thing, many, truth, really, guy, check, use, way,

Cluster 358 words: different, women, finally, gender, during, american, another, know, abuse, fight,

Cluster 359 words: holding, hands, sexually, officers, people, story, tried, survivors, abuse, little,

Cluster 360 words: great, day, times, look, love, men, thought, woman, job, help,

Cluster 361 words: feel, making, just, times, years, needs, happening, sexually, sadly, changing,

Cluster 362 words: workplace, women, issue, power, s, making, new, con

In [258]:
from __future__ import print_function

#sort cluster centers by proximity to centroid
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-5] 

cluster = []
vocab = []

for i in range(num_clusters):
    for ind in order_centroids[i, :10]: #replace 6 with n words per cluster
        cluster.append(i)
        vocab.append(vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0])

print(len(vocab))
print(len(cluster))

4250
4250


In [37]:
# vocab_list = pd.DataFrame({
#     'cluster': cluster,
#     'word': vocab
# })

# # print(vocab_list)

In [38]:
# vocab_list.to_csv('vocab_list.csv', index=False)

In [39]:
# # cluster = data.loc[(data.cluster == 0)]
# # cluster.shape

# for num in range(0,475):
# #     print(num)
# #     print('cluster_' + str(num))
#     cluster = data.loc[(data.cluster == num)]
#     cluster.to_csv('cluster_' + str(num))
# #     print(num.shape)

In [253]:
cluster_test = data.loc[data.cluster.isin([22]),]
filtered_test = cluster_test.nlargest(20, 'engagement')

for index, row in filtered_test.iterrows():
    print(row['original_text'])
    print()
    print(row['engagement'])
    print()
    print('***')

#MeToo
#FakeCases 
My  Friends Story: My Wife Filed A False Litigation Against Me; I Fought The Case And Won Ithttp://fb.me/GWJ8i25K 

335

***
#MeToo movement looms over jury selection in Bill Cosby case http://blbrd.cm/cVQlfe pic.twitter.com/zVts3w97ce

269

***
“In the case of the girls, we should exact a price at some other opportunity, in the dark, without witnesses and cameras”

Ben Caspit - Israeli Journalist 

commenting on #AhadTamimi
#Metoo
#HarveyWeinstein #Justice #Palestinehttps://twitter.com/georgegalloway/status/944955896026877953 …

242

***
Nice. My 1st academic publication! A chapter I wrote for this volume, using @clearlinesuk as a case study for engaged art to #endVAWG - emotional & unpaid labour & the constraints upon #activism #MeToo #TimesUp #phdlifepic.twitter.com/yyhTzGUclj

170

***
So much for #METOO and #TIMESUP: U.S. judge “goes to bat” for FGM in landmark case, reports David Menzies https://www.therebel.media/so_much_for_metoo_and_timesup_u_s_judge_goes_to

In [None]:
# for index, row in data.iterrows():
# if row['cluster'] == 409:
#     print(row['text'])
#     print()
#     print(row['hashtags'])
#     print()
#     print('***')

### Cluster Group 1 = Poltics

In [206]:
cluster_group_1 = data.loc[data.cluster.isin([90,159,274,303,366,398]),]
print(cluster_group_1.shape)

(13623, 15)


In [207]:
filtered_group_1 = cluster_group_1.nlargest(1000, 'engagement')
filtered_group_1.shape

(1000, 15)

In [218]:
politics = pd.DataFrame(data = filtered_group_1, columns = ['id','original_text','user','url','timestamp','likes','retweets','replies','engagement','cluster'])
politics.head()

Unnamed: 0,id,original_text,user,url,timestamp,likes,retweets,replies,engagement,cluster
196938,938642541616009216,Poll: 70% of Americans says Congress should in...,funder,/funder/status/938642541616009216,2017-12-07 05:33:15,7641,10455,237,28788,366
38741,972678963326783488,If #PA18 voters need another reason to vote fo...,joncoopertweets,/joncoopertweets/status/972678963326783488,2018-03-11 03:41:51,5625,3653,106,13037,159
37476,958549075539955712,The Democrats pulled out a Kennedy speaking Sp...,Yamiche,/Yamiche/status/958549075539955712,2018-01-31 03:54:43,6467,1572,145,9756,274
39278,958514984497147904,Democrats will be wearing all black at tonight...,PrisonPlanet,/PrisonPlanet/status/958514984497147904,2018-01-31 01:39:15,4816,1974,191,8955,274
156862,940415771376738304,56 U.S. House Democratic women seek probe of T...,AynRandPaulRyan,/AynRandPaulRyan/status/940415771376738304,2017-12-12 02:59:26,4288,2123,126,8660,274


In [219]:
path = 'politics.json'
politics.to_json(path_or_buf= path ,orient='records')

### Cluster Group 2 = Workplace

In [220]:
cluster_group_2 = data.loc[data.cluster.isin([244, 333, 362]),]
print(cluster_group_2.shape)

(7392, 15)


In [221]:
filtered_group_2 = cluster_group_2.nlargest(1000, 'engagement')
filtered_group_2.shape

(1000, 15)

In [222]:
workplace = pd.DataFrame(data = filtered_group_2, columns = ['id','original_text','user','url','timestamp','likes','retweets','replies','engagement','cluster'])
workplace.head()

Unnamed: 0,id,original_text,user,url,timestamp,likes,retweets,replies,engagement,cluster
338528,950086665439272960,1 in 3 women have been sexually harassed in th...,Alyssa_Milano,/Alyssa_Milano/status/950086665439272962,2018-01-07 19:28:07,6841,3085,293,13304,333
273001,952773184931139584,"The #metoo conversation is getting murky, but ...",laurenduca,/laurenduca/status/952773184931139584,2018-01-15 05:23:23,2747,262,23,3294,333
164204,963836829572071424,A man that works in my building at work approa...,Patrici15767099,/Patrici15767099/status/963836829572071424,2018-02-14 18:06:22,1461,346,118,2271,244
56509,966550741396619264,A new survey finds 81 percent of women in Amer...,NPR,/NPR/status/966550741396619265,2018-02-22 05:50:29,618,319,65,1321,333
110998,941834629073453056,.@LisaBloom is also hurting women in the workp...,bacon_texas,/bacon_texas/status/941834629073453056,2017-12-16 00:57:28,440,283,40,1046,362


In [223]:
path = 'workplace.json'
workplace.to_json(path_or_buf= path ,orient='records')

### Cluster Group 3 = Toxic Language

In [254]:
cluster_group_3 = data.loc[data.cluster.isin([104, 107, 192, 22]),]
print(cluster_group_3.shape)

(10011, 15)


In [255]:
filtered_group_3 = cluster_group_3.nlargest(1000, 'engagement')
filtered_group_3.shape

(1000, 15)

In [256]:
toxic = pd.DataFrame(data = filtered_group_3, columns = ['id','original_text','user','url','timestamp','likes','retweets','replies','engagement','cluster'])
toxic.head()

Unnamed: 0,id,original_text,user,url,timestamp,likes,retweets,replies,engagement,cluster
1247,980492707809312768,You are implying that women with certain jobs ...,StormyDaniels,/StormyDaniels/status/980492707809312768,2018-04-01 17:10:52,57804,9000,1127,76931,104
219306,938441664439369728,Don't fucking making fun of Taylor Swift & min...,jillboard,/jillboard/status/938441664439369728,2017-12-06 16:15:02,29121,7812,238,44983,107
87928,933468165220089856,THIS IS SO INFURIATING. THIS IS SO DISGUSTING....,Alyssa_Milano,/Alyssa_Milano/status/933468165220089861,2017-11-22 22:52:08,4090,2807,341,10045,104
67643,982713584818978816,Among the many gross notions that are embedded...,emilynussbaum,/emilynussbaum/status/982713584818978816,2018-04-07 20:15:51,4972,993,92,7050,104
71170,982661301842403328,Tony Robbins has made millions of dollars putt...,eorlins,/eorlins/status/982661301842403328,2018-04-07 16:48:05,3931,1290,203,6714,104


In [257]:
path = 'toxic.json'
toxic.to_json(path_or_buf= path ,orient='records')

### Cluster Group 4 = Support

In [245]:
cluster_group_4 = data.loc[data.cluster.isin([197,230,286,371]),]
print(cluster_group_3.shape)

(11679, 15)


In [246]:
filtered_group_4 = cluster_group_4.nlargest(1000, 'engagement')
filtered_group_4.shape

(1000, 15)

In [247]:
support = pd.DataFrame(data = filtered_group_4, columns = ['id','original_text','user','url','timestamp','likes','retweets','replies','engagement','cluster'])
support.head()

Unnamed: 0,id,original_text,user,url,timestamp,likes,retweets,replies,engagement,cluster
338519,950086987863662592,Michelle Williams is bringing founder of the #...,Variety,/Variety/status/950086987863662592,2018-01-07 19:29:24,4632,1148,73,7001,230
50286,920313515705028608,"Meet @TaranaBurke, the Activist Who Started #M...",democracynow,/democracynow/status/920313515705028608,2017-10-17 15:40:15,2662,1443,32,5580,197
12354,948733639512281088,"Today’s #WCW is @TaranaBurke, creator of the #...",itsgabrielleu,/itsgabrielleu/status/948733639512281094,2018-01-04 01:51:40,2759,1107,23,4996,230
182254,956531745737895936,Woke up with the brave Olympic gymnast and all...,TaranaBurke,/TaranaBurke/status/956531745737895937,2018-01-25 14:18:34,3479,601,17,4698,286
154551,954478137865367552,"Today, I stood up to the enemy that sexually a...",hmorrow6,/hmorrow6/status/954478137865367557,2018-01-19 22:18:16,3434,366,76,4242,286


In [248]:
path = 'support.json'
support.to_json(path_or_buf= path ,orient='records')

### Cluster Group 5 = Uplifting

In [259]:
cluster_group_5 = data.loc[data.cluster.isin([163, 169, 217, 272, 281, 331, 347, 356, 383]),]
print(cluster_group_5.shape)

(26130, 15)


In [260]:
filtered_group_5 = cluster_group_5.nlargest(1000, 'engagement')
filtered_group_5.shape

(1000, 15)

In [261]:
uplifting = pd.DataFrame(data = filtered_group_5, columns = ['id','original_text','user','url','timestamp','likes','retweets','replies','engagement','cluster'])
uplifting.head()

Unnamed: 0,id,original_text,user,url,timestamp,likes,retweets,replies,engagement,cluster
20282,982389487836778496,Life coach Tony Robbins says women are using #...,nowthisnews,/nowthisnews/status/982389487836778496,2018-04-06 22:48:00,27988,12831,5701,59351,281
40748,922826342890131456,One tweet has brought together 1.7 million voi...,Alyssa_Milano,/Alyssa_Milano/status/922826342890131456,2017-10-24 14:05:20,27905,5001,975,38882,347
106993,942029266543435776,Shout out to all the strong and beautiful wome...,LaurenJauregui,/LaurenJauregui/status/942029266543435778,2017-12-16 13:50:53,18960,5837,439,31073,169
20413,946733185488113664,#MeToo Let's change this - Women rule the worl...,vonstolk,/vonstolk/status/946733185488113665,2017-12-29 13:22:35,15997,3432,2,22863,356
37547,958548768537923584,"“You bravely say, #metoo. You steadfastly say,...",RepJoeKennedy,/RepJoeKennedy/status/958548768537923584,2018-01-31 03:53:30,9883,2836,430,15985,281


In [262]:
path = 'uplifting.json'
uplifting.to_json(path_or_buf= path ,orient='records')

### Cluster Group 6 = Oprah

In [266]:
cluster_group_6 = data.loc[data.cluster.isin([323, 350, 374]),]
print(cluster_group_6.shape)

(2841, 15)


In [267]:
filtered_group_6 = cluster_group_5.nlargest(1000, 'engagement')
filtered_group_6.shape

(1000, 15)

In [268]:
oprah = pd.DataFrame(data = filtered_group_6, columns = ['id','original_text','user','url','timestamp','likes','retweets','replies','engagement','cluster'])
oprah.head()

Unnamed: 0,id,original_text,user,url,timestamp,likes,retweets,replies,engagement,cluster
296719,950754335578783744,How can Oprah be a credible voice for abused w...,PrisonPlanet,/PrisonPlanet/status/950754335578783744,2018-01-09 15:41:12,3834,1808,364,7814,374
296074,950768408223678464,"Oprah:\n""We will reach a time when no woman ha...",allidoisowen,/allidoisowen/status/950768408223678464,2018-01-09 16:37:07,3360,1192,99,5843,374
300980,950608721167765504,Flashback: British Actress Says Weinstein Used...,chuckwoolery,/chuckwoolery/status/950608721167765510,2018-01-09 06:02:35,1979,1537,168,5221,323
322094,950253288162652160,Oprah Winfrey celebrates Time’s Up & #metoo as...,HuffPost,/HuffPost/status/950253288162652160,2018-01-08 06:30:13,2377,710,70,3867,374
295774,950774413804298240,#MyTake on Oprah and the wondertards of Hollyw...,MarcusBrutus_,/MarcusBrutus_/status/950774413804298240,2018-01-09 17:00:59,1143,735,136,2749,374


In [269]:
path = 'oprah.json'
oprah.to_json(path_or_buf= path ,orient='records')